using DotnetSpider.DataFlow.Parser; using DotnetSpider.DataFlow; using DotnetSpider.Downloader; using DotnetSpider.Http; using DotnetSpider.Scheduler.Component; using DotnetSpider.Selector; using DotnetSpider; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Serilog; using DotnetSpider.Scheduler; using Microsoft.Extensions.Hosting; using System.Reflection;
namespace DotnetSpiderExercise
{
    /// <summary>
    /// Spider whose nested parser extracts article entries (title, summary, URL)
    /// from <c>article.post-item</c> elements and writes them to
    /// <c>RecommendedRanking.txt</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): this class contains no override that registers start requests
    /// or attaches the nested <c>Parser</c> to the data flow. That wiring appears to
    /// be missing from this listing; verify before running.
    /// </remarks>
    public class RecommendedRankingSpider : Spider
    {
        /// <summary>
        /// Creates the spider; all arguments are forwarded unchanged to the base <c>Spider</c>.
        /// </summary>
        /// <param name="options">Spider options forwarded to the base class.</param>
        /// <param name="services">Framework services forwarded to the base class.</param>
        /// <param name="logger">Logger forwarded to the base class.</param>
        public RecommendedRankingSpider(
            IOptions<SpiderOptions> options,
            DependenceServices services,
            ILogger<Spider> logger)
            : base(options, services, logger)
        {
        }

        /// <summary>
        /// Builds a host for this spider with Serilog logging, the
        /// <c>HttpClientDownloader</c> and a queue-based distinct BFS scheduler
        /// (using <c>HashSetDuplicateRemover</c>), then runs it.
        /// </summary>
        /// <returns>The task returned by running the built host.</returns>
        public static async Task RunAsync()
        {
            var builder = Builder.CreateDefaultBuilder<RecommendedRankingSpider>();
            builder.UseSerilog();
            builder.UseDownloader<HttpClientDownloader>();
            builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();
            await builder.Build().RunAsync();
        }

        /// <summary>
        /// Data parser that turns a downloaded page into a list of
        /// <c>RecommendedRankingModel</c> entries and dumps them to a text file.
        /// </summary>
        class Parser : DataParser
        {
            /// <summary>
            /// No initialization is performed; completes immediately.
            /// </summary>
            /// <returns>A completed task.</returns>
            public override Task InitializeAsync()
            {
                return Task.CompletedTask;
            }

            /// <summary>
            /// Extracts title, summary and URL from every
            /// <c>article[@class='post-item']</c> node of the page and writes one
            /// record per article to <c>RecommendedRanking.txt</c>.
            /// </summary>
            /// <param name="context">Data-flow context whose <c>Selectable</c> exposes the page content.</param>
            /// <returns>A completed task; all work is done synchronously.</returns>
            protected override Task ParseAsync(DataFlowContext context)
            {
                var recommendedRankingList = new List<RecommendedRankingModel>();

                // Parse the page data: each matching <article> node is one entry.
                var recommendedList = context.Selectable.SelectList(Selectors.XPath(".//article[@class='post-item']"));
                foreach (var news in recommendedList)
                {
                    var articleTitle = news.Select(Selectors.XPath(".//a[@class='post-item-title']"))?.Value;
                    // Strip line breaks and spaces so the summary fits on a single output line.
                    var articleSummary = news.Select(Selectors.XPath(".//p[@class='post-item-summary']"))?.Value?.Replace("\n", "").Replace(" ", "");
                    var articleUrl = news.Select(Selectors.XPath(".//a[@class='post-item-title']/@href"))?.Value;

                    // Collect the entry; without this the list stays empty and nothing is written.
                    // NOTE(review): assumes RecommendedRankingModel has a parameterless
                    // constructor and settable ArticleTitle/ArticleSummary/ArticleUrl - confirm.
                    recommendedRankingList.Add(new RecommendedRankingModel
                    {
                        ArticleTitle = articleTitle,
                        ArticleSummary = articleSummary,
                        ArticleUrl = articleUrl
                    });
                }

                // The file is opened without an append flag, so each parsed page
                // overwrites the previous content of RecommendedRanking.txt.
                using (StreamWriter sw = new StreamWriter("RecommendedRanking.txt"))
                {
                    foreach (RecommendedRankingModel model in recommendedRankingList)
                    {
                        string line = $"文章标题:{model.ArticleTitle}\r\n文章简介:{model.ArticleSummary}\r\n文章地址:{model.ArticleUrl}";
                        sw.WriteLine(line + "\r\n ========================================================================================== \r\n");
                    }
                }

                return Task.CompletedTask;
            }
        }
    }
}
Program 执行数据抓取
namespace DotnetSpiderExercise { public class Program { static async Task Main(string[] args) { Console.WriteLine("網(wǎng)頁(yè)數(shù)據(jù)抓取開(kāi)始...");