examples: fix examples/web_crawler/web_crawler.v to fetch from https://news.ycombinator.com/ instead of tuicool.com (which is offline) (fix #21854)

Delyan Angelov 2024-07-17 17:58:15 +03:00
parent e9652f86c5
commit be7b24d87b
2 changed files with 15 additions and 21 deletions

examples/web_crawler/README.md

@@ -1,7 +1,5 @@
 # web_crawler
 
-This web crawler fetches news from tuicool.com,
-(a chinese site similar to hacker-news.firebaseio.com).
-
+web_crawler is a very simple web crawler. This simple web crawler fetches news from the homepage of HN (https://news.ycombinator.com/).
 
 # Compile and Run
@@ -19,4 +17,4 @@ v run web_crawler.v
 
 This project shows how to use http.fetch() to get http.Response,
 and then html.parse() to parse the returned html.
 It's easy, isn't it?

examples/web_crawler/web_crawler.v

@@ -2,24 +2,20 @@ import net.http
 import net.html
 
 fn main() {
-	// http.fetch() sends an HTTP request to the URL with the given method and configurations.
-	config := http.FetchConfig{
-		user_agent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
-	}
-	resp := http.fetch(http.FetchConfig{ ...config, url: 'https://tuicool.com' }) or {
-		println('failed to fetch data from the server')
-		return
-	}
-	// html.parse() parses and returns the DOM from the given text.
+	site_url := 'https://news.ycombinator.com'
+	resp := http.fetch(
+		url:        site_url
+		user_agent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
+	)!
 	mut doc := html.parse(resp.body)
-	// html.DocumentObjectModel.get_tags_by_attribute_value() retrieves all tags in the document that have the given attribute name and value.
-	tags := doc.get_tags_by_attribute_value('class', 'list_article_item')
-	for tag in tags {
-		el := tag.children[1].children[0].children[0].children[0]
-		href := el.attributes['href'] or { panic('key not found') }
-		title := el.attributes['title'] or { panic('key not found') }
-		println('href: ${href}')
-		println('title: ${title}')
-		println('')
+	tags := doc.get_tags_by_attribute_value('class', 'titleline')
+	for i, tag in tags {
+		el := tag.children[0]
+		mut href := el.attributes['href']!
+		if !href.starts_with('http') {
+			href = '${site_url}/${href}'
+		}
+		title := el.content
+		println('${i + 1:2}. title: ${title:-90s} href: ${href}')
 	}
 }