Preparation
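The crawlers below depend on three third-party modules: cheerio (server-side HTML parsing with a jQuery-style API), request (simplified HTTP requests), and iconv-lite (character-encoding conversion, needed only for the second crawler). Assuming Node.js and npm are already installed, one command pulls all three in: npm install cheerio request iconv-lite.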
Create a project

Now that the preparations are done, let's create the project.
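A minimal layout, assuming you are starting from an empty folder: the crawler script itself (news.js below) plus empty data and image subfolders next to it. Create ./data and ./image by hand before the first run; fs.appendFile and fs.createWriteStream create files but not missing directories.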
Wuhan University School of Computer Science News Crawler Code

The following is the crawler code for the news of the School of Computer Science of Wuhan University. Copy it into the .js file you created and save it.

var http = require('http');
var fs = require('fs');
var cheerio = require('cheerio');
var request = require('request');
var i = 0;
//Initial url
var url = "http://cs.whu.edu.cn/a/xinwendongtaifabu/2018/0428/7053.html";

function fetchPage(x) { //Encapsulates startRequest in one more layer of function
  startRequest(x);
}

function startRequest(x) {
  //Use the http module to issue a GET request to the server
  http.get(x, function (res) {
    var html = ''; //Used to store the entire HTML content of the requested page
    var titles = [];
    res.setEncoding('utf-8'); //Prevent garbled Chinese characters
    //Listen to the data event and take one chunk of data at a time
    res.on('data', function (chunk) {
      html += chunk;
    });
    //Listen for the end event; once the whole page has arrived, run the callback
    res.on('end', function () {
      var $ = cheerio.load(html); //Use the cheerio module to parse the HTML
      var news_item = {
        //Get the title of the article
        title: $('div#container dt').text().trim(),
        i: i = i + 1,
      };
      console.log(news_item); //Print the news information
      var news_title = $('div#container dt').text().trim();
      savedContent($, news_title); //Store the content of each article under its title
      savedImg($, news_title); //Store the images of each article under its title
      //URL of the next article
      var nextLink = "http://cs.whu.edu.cn" + $("dd.Paging a").attr('href');
      str1 = nextLink.split('-'); //Strip the Chinese characters after the url
      str = encodeURI(str1[0]);
      //This is one of the highlights: i controls how many articles are crawled.
      //This Wuhan University series has only 8 articles, so it is set to 8.
      if (i <= 8) {
        fetchPage(str);
      }
    });
  }).on('error', function (err) {
    console.log(err);
  });
}

//Stores the crawled news text locally
function savedContent($, news_title) {
  $('dd.info').each(function (index, item) {
    var x = $(this).text();
    var y = x.substring(0, 2).trim();
    if (y == '') {
      x = x + '\n';
      //Append each paragraph to a file under /data named after the news title
      fs.appendFile('./data/' + news_title + '.txt', x, 'utf-8', function (err) {
        if (err) {
          console.log(err);
        }
      });
    }
  });
}

//Stores the crawled image resources locally
function savedImg($, news_title) {
  $('dd.info img').each(function (index, item) {
    var img_title = $(this).parent().next().text().trim(); //Get the title of the image
    if (img_title.length > 35 || img_title == "") {
      img_title = "Null";
    }
    var img_filename = img_title + '.jpg';
    var img_src = 'http://cs.whu.edu.cn' + $(this).attr('src'); //Get the URL of the image
    //Use the request module to ask the server for the image resource
    request.head(img_src, function (err, res, body) {
      if (err) {
        console.log(err);
      }
    });
    //Stream the image into the local /image directory, named after the news title and the image title
    request(img_src).pipe(fs.createWriteStream('./image/' + news_title + '---' + img_filename));
  });
}

fetchPage(url); //The main program starts running

Now comes the exciting moment. In the current folder, run the .js file you created; mine, for example, is news.js:

node news.js
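To see in isolation what the cheerio calls above do, here is a minimal sketch run against a hypothetical HTML fragment shaped like the news page (the fragment is invented for illustration; the selectors are the ones the crawler uses):

var cheerio = require('cheerio');
//Hypothetical fragment mimicking the structure the selectors expect
var html = '<div id="container"><dt> Example news title </dt></div>' +
           '<dd class="Paging"><a href="/a/xinwendongtaifabu/2018/0428/7054.html">next</a></dd>';
var $ = cheerio.load(html);
console.log($('div#container dt').text().trim()); //prints "Example news title"
console.log($('dd.Paging a').attr('href'));       //prints "/a/xinwendongtaifabu/2018/0428/7054.html"

cheerio.load() never makes a network request; it only parses the string it is handed, which is why the crawler pairs it with http.get.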
Text resources (the .txt files written to ./data):

Image resources (the images written to ./image):

Caoliu Technology Forum Crawler

I was not satisfied after crawling the Wuhan University news, so I tried crawling Caoliu's technical discussion forum (which, of course, can also crawl some things you understand). A few problems came up.

First, the forum refuses requests that do not carry proper headers, so the plain URL string is replaced by an options object:

var url = {
  hostname: 'cl.5fy.xyz',
  path: '/thread0806.php?fid=7',
  headers: {
    'Content-Type': 'text/html', //Without this field, you cannot access the site
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
  }
};

Secondly, Node.js cannot decode GBK, the encoding this forum is served in, out of the box, so an additional package (iconv-lite) is needed to convert the encoding. The code is therefore modified as follows:

/*
 * @Author: user
 * @Date: 2018-04-28 19:34:50
 * @Last Modified by: user
 * @Last Modified time: 2018-04-30 21:35:26
 */
var http = require('http');
var fs = require('fs');
var cheerio = require('cheerio');
var request = require('request');
var iconv = require('iconv-lite');
var i = 0;
//Used to decide whether we are still collecting URLs or already visiting them
var temp = 0;
let startPage = 3;   //Which page to start crawling from
let page = startPage;
let endPage = 5;     //Crawl up to (but not including) this page
let searchText = ''; //Keyword to filter threads by; the default empty string matches everything
//Initial url
var url = {
  hostname: '1024liuyouba.tk',
  path: '/thread0806.php?fid=16' + '&search=&page=' + startPage,
  headers: {
    'Content-Type': 'text/html', //Without this field, you cannot access the site
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
  }
};
//Stores the thread URLs collected from the list pages
urlList = [];

//Encapsulates startRequest with a 5-second delay between requests
function fetchPage(x) {
  setTimeout(function () {
    startRequest(x);
  }, 5000);
}

//First collect the URLs of the threads to be visited
function getUrl(x) {
  temp++;
  http.get(x, function (res) {
    var html = '';
    res.setEncoding('binary');
    res.on('data', function (chunk) {
      html += chunk;
    });
    res.on('end', function () {
      var buf = Buffer.from(html, 'binary'); //Buffer.from replaces the deprecated new Buffer()
      var str = iconv.decode(buf, 'GBK');
      var $ = cheerio.load(str); //Use the cheerio module to parse the HTML
      $('tr.tr3 td.tal h3 a').each(function () {
        var search = $(this).text();
        if (search.indexOf(searchText) >= 0) {
          var nextLink = "http://cl.5fy.xyz/" + $(this).attr('href');
          str1 = nextLink.split('-'); //Strip the Chinese characters after the url
          str = encodeURI(str1[0]);
          urlList.push(str);
        }
      });
      page++;
      if (page < endPage) {
        //Move on to the next list page
        x.path = '/thread0806.php?fid=16' + '&search=&page=' + page;
        getUrl(x);
      } else if (urlList.length != 0) {
        fetchPage(urlList.shift());
      } else {
        console.log('No keywords found!');
      }
    });
  }).on('error', function (err) {
    console.log(err);
  });
}

function startRequest(x) {
  if (temp === 0) {
    getUrl(x);
  } else {
    //Use the http module to issue a GET request to the server
    http.get(x, function (res) {
      var html = ''; //Used to store the entire HTML content of the requested page
      res.setEncoding('binary');
      var titles = [];
      //Listen to the data event and take one chunk of data at a time
      res.on('data', function (chunk) {
        html += chunk;
      });
      //Listen for the end event; once the whole page has arrived, run the callback
      res.on('end', function () {
        var buf = Buffer.from(html, 'binary');
        var str = iconv.decode(buf, 'GBK');
        var $ = cheerio.load(str); //Use the cheerio module to parse the HTML
        var news_item = {
          //Get the title of the thread
          title: $('h4').text().trim(),
          //i counts how many threads have been fetched
          i: i = i + 1,
        };
        console.log(news_item); //Print the information
        var news_title = $('h4').text().trim();
        savedContent($, news_title); //Store the text of each thread under its title
        savedImg($, news_title); //Store the images of each thread under its title
        //If there are URLs left, keep visiting
        if (urlList.length != 0) {
          fetchPage(urlList.shift());
        }
      });
    }).on('error', function (err) {
      console.log(err);
    });
  }
}

//Stores the crawled text content locally
function savedContent($, news_title) {
  $("div.t2[style].tpc_content.do_not_catch").each(function (index, item) {
    var x = $(this).text();
    x = x + '\n';
    //Append each post to a file under /data named after the thread title
    fs.appendFile('./data/' + news_title + '.txt', x, 'utf-8', function (err) {
      if (err) {
        console.log(err);
      }
    });
  });
}

//Stores the crawled image resources locally
function savedImg($, news_title) {
  //Create one folder per thread
  fs.mkdir('./image/' + news_title, function (err) {
    if (err) { console.log(err); }
  });
  $('.tpc_content.do_not_catch input[src]').each(function (index, item) {
    var img_title = index; //Number each picture
    var img_filename = img_title + '.jpg';
    var img_src = $(this).attr('src'); //Get the URL of the image
    //Use the request module to ask the server for the image resource
    request.head(img_src, function (err, res, body) {
      if (err) {
        console.log(err);
      }
    });
    //No delay argument: the download fires on the next tick
    setTimeout(function () {
      request({ uri: img_src, encoding: 'binary' }, function (error, response, body) {
        if (!error && response.statusCode == 200) {
          fs.writeFile('./image/' + news_title + '/' + img_filename, body, 'binary', function (err) {
            if (err) { console.log(err); }
          });
        }
      });
    });
  });
}

fetchPage(url); //The main program starts running

Results (text files under ./data, one folder of numbered images per thread under ./image):
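The GBK workaround is the piece most worth reusing elsewhere: read the response as a binary string, then decode it explicitly with iconv-lite. A minimal standalone sketch, with example.com standing in for a real GBK-encoded site:

var http = require('http');
var iconv = require('iconv-lite');

http.get({ hostname: 'example.com', path: '/' }, function (res) {
  var html = '';
  res.setEncoding('binary'); //keep the raw bytes intact as a binary string
  res.on('data', function (chunk) {
    html += chunk;
  });
  res.on('end', function () {
    //Buffer.from replaces the deprecated new Buffer(html, 'binary')
    var str = iconv.decode(Buffer.from(html, 'binary'), 'GBK');
    console.log(str.slice(0, 200)); //decoded text instead of mojibake
  });
});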