mirror of
https://github.com/vlang/v.git
synced 2025-09-14 15:02:33 +03:00
examples: fix examples/web_crawler/web_crawler.v to fetch from https://news.ycombinator.com/ instead of tuicool.com (which is offline) (fix #21854)
This commit is contained in:
parent
e9652f86c5
commit
be7b24d87b
2 changed files with 15 additions and 21 deletions
|
@ -1,7 +1,5 @@
|
||||||
# web_crawler
|
# web_crawler
|
||||||
web_crawler is a very simple web crawler.
|
This simple web crawler fetches news from the homepage of HN (https://news.ycombinator.com/).
|
||||||
This web crawler fetches news from tuicool.com,
|
|
||||||
(a Chinese site similar to hacker-news.firebaseio.com).
|
|
||||||
|
|
||||||
# Compile and Run
|
# Compile and Run
|
||||||
|
|
||||||
|
|
|
@ -2,24 +2,20 @@ import net.http
|
||||||
import net.html
|
import net.html
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
// http.fetch() sends an HTTP request to the URL with the given method and configurations.
|
site_url := 'https://news.ycombinator.com'
|
||||||
config := http.FetchConfig{
|
resp := http.fetch(
|
||||||
|
url: site_url
|
||||||
user_agent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
|
user_agent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
|
||||||
}
|
)!
|
||||||
resp := http.fetch(http.FetchConfig{ ...config, url: 'https://tuicool.com' }) or {
|
|
||||||
println('failed to fetch data from the server')
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// html.parse() parses and returns the DOM from the given text.
|
|
||||||
mut doc := html.parse(resp.body)
|
mut doc := html.parse(resp.body)
|
||||||
// html.DocumentObjectModel.get_tags_by_attribute_value() retrieves all tags in the document that have the given attribute name and value.
|
tags := doc.get_tags_by_attribute_value('class', 'titleline')
|
||||||
tags := doc.get_tags_by_attribute_value('class', 'list_article_item')
|
for i, tag in tags {
|
||||||
for tag in tags {
|
el := tag.children[0]
|
||||||
el := tag.children[1].children[0].children[0].children[0]
|
mut href := el.attributes['href']!
|
||||||
href := el.attributes['href'] or { panic('key not found') }
|
if !href.starts_with('http') {
|
||||||
title := el.attributes['title'] or { panic('key not found') }
|
href = '${site_url}/${href}'
|
||||||
println('href: ${href}')
|
}
|
||||||
println('title: ${title}')
|
title := el.content
|
||||||
println('')
|
println('${i + 1:2}. title: ${title:-90s} href: ${href}')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue