examples: fix examples/web_crawler/web_crawler.v to fetch from https://news.ycombinator.com/ instead of tuicool.com (which is offline) (fix #21854)

Delyan Angelov 2024-07-17 17:58:15 +03:00
parent e9652f86c5
commit be7b24d87b
2 changed files with 15 additions and 21 deletions

examples/web_crawler/README.md

@@ -1,7 +1,5 @@
 # web_crawler
 
-This web crawler fetches news from tuicool.com,
-(a chinese site similar to hacker-news.firebaseio.com).
-
+web_crawler is a very simple web crawler. This simple web crawler fetches news from the homepage of HN (https://news.ycombinator.com/).
 
 # Compile and Run
@@ -19,4 +17,4 @@ v run web_crawler.v
 
 This project shows how to use http.fetch() to get http.Response,
 and then html.parse() to parse the returned html.
 It's easy, isn't it?

examples/web_crawler/web_crawler.v

@@ -2,24 +2,20 @@ import net.http
 import net.html
 
 fn main() {
-	// http.fetch() sends an HTTP request to the URL with the given method and configurations.
-	config := http.FetchConfig{
-		user_agent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
-	}
-	resp := http.fetch(http.FetchConfig{ ...config, url: 'https://tuicool.com' }) or {
-		println('failed to fetch data from the server')
-		return
-	}
-	// html.parse() parses and returns the DOM from the given text.
+	site_url := 'https://news.ycombinator.com'
+	resp := http.fetch(
+		url:        site_url
+		user_agent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
+	)!
 	mut doc := html.parse(resp.body)
-	// html.DocumentObjectModel.get_tags_by_attribute_value() retrieves all tags in the document that have the given attribute name and value.
-	tags := doc.get_tags_by_attribute_value('class', 'list_article_item')
-	for tag in tags {
-		el := tag.children[1].children[0].children[0].children[0]
-		href := el.attributes['href'] or { panic('key not found') }
-		title := el.attributes['title'] or { panic('key not found') }
-		println('href: ${href}')
-		println('title: ${title}')
-		println('')
+	tags := doc.get_tags_by_attribute_value('class', 'titleline')
+	for i, tag in tags {
+		el := tag.children[0]
+		mut href := el.attributes['href']!
+		if !href.starts_with('http') {
+			href = '${site_url}/${href}'
+		}
+		title := el.content
+		println('${i + 1:2}. title: ${title:-90s} href: ${href}')
 	}
 }